{
 "metadata": {
  "name": "",
  "signature": "sha256:2fcd22cfa22302ee1811b8a0e30c87ef75f833e0d94733c684ed6d188ce986b7"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from collections import OrderedDict\n",
      "from time import time"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Path of the KDD99 data file to analyse.\n",
      "data_file = '/nfs/data/KDD99/kddcup.data'\n",
      "\n",
      "# `sc` is not created in this notebook; presumably the PySpark kernel\n",
      "# provides the SparkContext -- confirm before running elsewhere.\n",
      "raw_data = sc.textFile(data_file)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Count how many records carry each label and print the labels in\n",
      "# decreasing order of frequency.\n",
      "# Single-argument print(...) calls are used so the cell runs on both\n",
      "# Python 2 and Python 3.\n",
      "print(\"Counting all different labels\")\n",
      "\n",
      "# The label is the last comma-separated field of each record.\n",
      "labels = raw_data.map(lambda line: line.strip().split(\",\")[-1])\n",
      "\n",
      "# Only the countByValue() call is timed; it returns the per-label counts\n",
      "# to the driver as a dict-like object.\n",
      "t0 = time()\n",
      "label_counts = labels.countByValue()\n",
      "tt = time() - t0\n",
      "\n",
      "# Keep the (label, count) pairs ordered from most to least frequent.\n",
      "sorted_labels = OrderedDict(\n",
      "    sorted(label_counts.items(), key=lambda t: t[1], reverse=True)\n",
      ")\n",
      "for label, count in sorted_labels.items():\n",
      "    print(\"%s %s\" % (label, count))\n",
      "\n",
      "print(\"Counted in {} seconds\".format(round(tt, 3)))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Counting all different labels\n",
        "smurf."
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 2807886\n",
        "neptune. 1072017\n",
        "normal. 972781\n",
        "satan. 15892\n",
        "ipsweep. 12481\n",
        "portsweep. 10413\n",
        "nmap. 2316\n",
        "back. 2203\n",
        "warezclient. 1020\n",
        "teardrop. 979\n",
        "pod. 264\n",
        "guess_passwd. 53\n",
        "buffer_overflow. 30\n",
        "land. 21\n",
        "warezmaster. 20\n",
        "imap. 12\n",
        "rootkit. 10\n",
        "loadmodule. 9\n",
        "ftp_write. 8\n",
        "multihop. 7\n",
        "phf. 4\n",
        "perl. 3\n",
        "spy. 2\n",
        "Counted in 6.866 seconds\n"
       ]
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}